rm(list = ls())
require(readr)
require(tidyr)
require(dplyr)

State and County FIPS Codes

state_fips <- read.table('data/raw/state_fips.txt', sep = '|', header = T)
county_fips <- read.table('data/raw/county_fips.txt', sep = '|', quote = '', header = T)
county_fips$fips <- 1000 * county_fips$STATEFP + county_fips$COUNTYFP
fips <- left_join(county_fips, state_fips, by = c('STATE', 'STATEFP'))
fips

Voting Eligible Population by County

population <- read.csv('data/raw/voting_eligible_population.csv', col.names = c('County', 'MaleAdultNative', 'MaleAdultNaturalized', 'FemaleAdultNative', 'FemaleAdultNaturalized'))
population[, 2:5] <- apply(population[, 2:5], 2, parse_number)
population$Population <- apply(population[, 2:5], 1, sum)
for (i in rownames(population)) {
  if (is.na(population[i, 'Population'])) {
    population[i, 'Population'] <- population[as.character(as.numeric(i) + 1), 'Population']
  }
}
population <- population %>%
  subset(County != population[2, 'County']) %>%
  separate_wider_delim(County, ', ', names = c('County', 'State'))

# Left join to get FIPS codes (various territories are not included)
population <- left_join(population, fips, by = c('County' = 'COUNTYNAME', 'State' = 'STATE_NAME'))

population

Voting Returns

# Removing Alaska, Federal Precinct, RI (seems to be overseas ballots or something), and Kansas City, MO (seems to be already included in other counties)
returns <- read.csv('data/raw/countypres_2000-2020.csv') %>%
  subset(year == 2020 & state != 'ALASKA' & !(county_name %in% c('KANSAS CITY', 'FEDERAL PRECINCT'))) %>%
  group_by(county_fips, county_name, state) %>%
  summarize(totalvotes = mean(totalvotes))
## `summarise()` has grouped output by 'county_fips', 'county_name'. You can
## override using the `.groups` argument.
colnames(returns)[1] <- 'fips'

# Alaska's voting results are by voting district, not county equivalent, so Alaska data is pulled from a different source
alaska <- read.csv('data/raw/alaska.csv') %>%
  left_join(fips, by = c('county_name' = 'COUNTYNAME'))

returns <- rbind(returns, alaska) %>%
  select(fips, county_name, state, totalvotes)

returns

Calculating Voter Turnout Rate by County

# Left join removes Kalawao County, Hawaii (no election results) and Puerto Rico (can't vote in presidential election)
returns <- left_join(returns, population, by = 'fips')

# Valdez-Cordova Census Area (FIPS 2261, only in `characteristics`) was split into Chugach Census Area (FIPS 2063, only in `returns`) and Copper River Census Area (FIPS 2066, only in `returns`) - they will be recombined here
valdez.cordova <- subset(returns, fips %in% c(2063, 2066))
returns <- rbind(returns, data.frame(
  fips = 2261,
  State = 'Alaska',
  County = 'Valdez-Cordova Census Area',
  totalvotes = sum(valdez.cordova$totalvotes),
  Population = sum(valdez.cordova$Population)
)) %>%
  subset(!(fips %in% c(2063, 2066)))

returns$turnout.rate <- returns$totalvotes / returns$Population

returns
# Wade-Hampton Census Area (FIPS 2270, only in `characteristics`) is now Kusilvak Census Area (FIPS 2158, in `characteristics` and `returns`) - FIPS changed accordingly and data is merged between entries
# Kalawao County, Hawaii (FIPS 15005) and Puerto Rican municipios (FIPS 72xxx) are in `characteristics` but not in `returns`, so we drop them
characteristics <- read.csv('data/raw/cty_covariates.csv')
characteristics <- characteristics %>%
  mutate(
    fips = case_when(state == 2 & county == 270 ~ 2158, .default = 1000 * state + county),
    ann_avg_job_growth_2004_2013 = case_when(state == 2 & county == 270 ~ subset(characteristics, state == 2 & county == 158)$ann_avg_job_growth_2004_2013, .default = ann_avg_job_growth_2004_2013)
  ) %>%
  subset(fips != 15005 & fips < 72000 & !(state == 2 & county == 158))
characteristics
data <- full_join(characteristics, returns, by = 'fips') %>%
  select(
    State,
    County,
    fips,
    frac_coll_plus2010,
    foreign_share2010,
    med_hhinc2016,
    poor_share2010,
    share_white2010,
    share_black2010,
    share_hisp2010,
    share_asian2010,
    gsmn_math_g3_2013,
    rent_twobed2015,
    singleparent_share2010,
    traveltime15_2010,
    emp2000,
    ln_wage_growth_hs_grad,
    popdensity2010,
    ann_avg_job_growth_2004_2013,
    job_density_2013,
    turnout.rate
  )

data
write.csv(data, 'data/processed/data.csv')